To compare the performance of varKoder to Skmer, we will use leave-one-out cross validation: we remove one sample from the dataset, train a varKoder model or make a Skmer reference with the remaining samples, and then use the sample left out as query. We then record whether or not we correctly identify this sample in varKoder, and whether or not the closest sample with Skmer has the same identification.
For traditional barcodes, we assembled the genome of each sample, and then used BLAST to search for each of the traditional barcode genes. We recorded if we could find this gene in the assembly, coding as missing data if we could not. We then recorded whether the best BLAST hit for a sample was the correct species.
rm(list=ls())
library(tidyverse)
── Attaching core tidyverse packages ────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2 ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(future)
library(ggthemes)
library(patchwork)
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:patchwork’:
align_plots
The following object is masked from ‘package:ggthemes’:
theme_map
The following object is masked from ‘package:lubridate’:
stamp
library(patchwork)
For VarKoder, we used leave-one-out cross-validation to test the accuracy for family, genera, species in the joint Malpighiaceae-Chrysobalanaceae dataset. We used as input data varKodes produced from kmers of size 7 and 500Kbp to 200Mbp of data, or all of the data available if less than 200 Mbp. For each sample, we built a model using as input data from all other samples. Then we queried the sample left out, using as input the images generated from 500Kb to the total data available. Now we will summarize the results.
In this test, we used varKoder v0.6.0. Let’s process the results.
# Read every varKoder 'predictions.csv' found under `infolder` and score each
# query against its actual labels at family, genus and species level.
#
# Arguments:
#   infolder  Folder searched recursively for files matching 'predictions.csv'.
#   workers   Number of parallel R sessions used to read the files (default 12,
#             the value that used to be hard-coded).
#
# Returns a row-wise tibble with one row per query of a standardized size
# (query_bp such as '00000500K') and the added columns:
#   query_labels / predicted_list   list-columns with the split label strings
#   <rank>_correct     the actual label of that rank is among the predictions
#   <rank>_incorrect   some predicted label of that rank is not an actual label
# The species columns are NA for samples that have no species label.
#
# Side effect: the future plan is set to multisession while running and is
# always reset to sequential on exit, including when an error occurs.
read_and_process_xval <- function(infolder, workers = 12){
  prediction_files <- list.files(infolder,
                                 'predictions.csv',
                                 recursive = TRUE,
                                 full.names = TRUE)
  if (length(prediction_files) == 0) {
    stop("No 'predictions.csv' files found under '", infolder, "'", call. = FALSE)
  }

  plan(multisession, workers = workers)
  on.exit(plan(sequential), add = TRUE)

  # Labels of one taxonomic rank within a character vector of labels
  labels_of_rank <- function(labels, rank) labels[str_detect(labels, rank)]
  # TRUE if any predicted label of `rank` is not among the query labels of that rank
  has_wrong_label <- function(predicted, query, rank) {
    any(!(labels_of_rank(predicted, rank) %in% labels_of_rank(query, rank)))
  }

  varkoder_results <- prediction_files %>%
    furrr::future_map_dfr(~read_csv(.x) %>% mutate(sample_id = as.character(sample_id))) %>%
    select(-1) %>%
    filter(str_detect(query_basepairs,'^0+[125]0+K$')) %>% #we will ignore queries that are not standardized sizes
    rename(query_bp = query_basepairs) %>%
    mutate(quality_included = TRUE)

  varkoder_results %>%
    mutate(
      # Split first, then drop the quality flag. Removing it with the regex
      # ';*low_quality:True;*' also removed the separators on both sides, so a
      # flag in the middle of the string fused its two neighbouring labels.
      query_labels = str_split(actual_labels, ';') %>%
        purrr::map(~ .x[!(.x %in% 'low_quality:True')]),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      family_correct = labels_of_rank(query_labels, 'family') %in% predicted_list,
      genus_correct = labels_of_rank(query_labels, 'genus') %in% predicted_list,
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               labels_of_rank(query_labels, 'species') %in% predicted_list,
                               NA),
      # A query with no predicted labels at all (NA) has made no wrong
      # prediction. Without the guard the NA was scored as a wrong label; the
      # SRA and barcode processing in this file treat that case the same way.
      family_incorrect = !is.na(predicted_labels) &&
        has_wrong_label(predicted_list, query_labels, 'family'),
      genus_incorrect = !is.na(predicted_labels) &&
        has_wrong_label(predicted_list, query_labels, 'genus'),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 !is.na(predicted_labels) &&
                                   has_wrong_label(predicted_list, query_labels, 'species'),
                                 NA)
    )
}
# Classify every row of a scored results table at one taxonomic rank and count
# the outcomes per query size.
#
# Arguments:
#   res    Data frame with columns `query_bp`, `<level>_correct` and
#          `<level>_incorrect` (logical, possibly NA).
#   level  Rank prefix of the two logical columns, e.g. 'genus'.
#
# Outcome per row:
#   correct       correct and not incorrect
#   ambiguous     correct and incorrect
#   incorrect     not correct and incorrect
#   inconclusive  neither
# Rows where either flag is NA are dropped.
#
# Returns an ungrouped tibble with columns query_bp (numeric: the number in
# `query_bp` with any 'K' removed, times 1000), result, N (count) and p
# (fraction within query_bp). Combinations of query_bp and result that do not
# occur are added with N = 0 and p = 0.
summarize_results <- function(res, level){
  correct_col <- str_c(level, 'correct', sep = '_')
  incorrect_col <- str_c(level, 'incorrect', sep = '_')

  res %>%
    ungroup() %>%
    mutate(
      # Columns are looked up by name in the piped data rather than by indexing
      # the `res` argument, which returned one-column tibbles inside ifelse().
      result = case_when(
        is.na(.data[[correct_col]]) | is.na(.data[[incorrect_col]]) ~ NA_character_,
        .data[[correct_col]] & !.data[[incorrect_col]] ~ 'correct',
        .data[[correct_col]] & .data[[incorrect_col]] ~ 'ambiguous',
        !.data[[correct_col]] & .data[[incorrect_col]] ~ 'incorrect',
        TRUE ~ 'inconclusive'
      )
    ) %>%
    filter(!is.na(result)) %>%
    group_by(query_bp, result) %>%
    summarise(N = n(), .groups = 'drop') %>%
    group_by(query_bp) %>%
    mutate(p = N / sum(N)) %>%
    ungroup() %>%
    # Temporarily a factor so that complete() expands over the observed sizes
    mutate(query_bp = as.factor(as.integer(str_remove(query_bp, 'K')) * 1000)) %>%
    complete(query_bp, result, fill = list(p = 0, N = 0)) %>%
    mutate(query_bp = as.numeric(as.character(query_bp)))
}
# Stacked area plot of identification outcomes against query size.
#
# Arguments:
#   sum_df    Summary table with columns query_bp (numeric), result, N and p.
#   title     Plot title.
#   relative  TRUE plots the fraction `p` (y from 0 to 1); FALSE plots the
#             count `N` (y from 0 to the largest per-size total).
#   grid      TRUE draws horizontal grid lines on top of the areas.
#
# Returns a ggplot object. The x axis is log10-scaled and fixed to the range
# 500 Kbp - 200 Mbp.
#
# The y scale is added exactly once, so the "Scale for y is already present"
# message that the duplicated scale used to trigger no longer appears.
plot_area <- function(sum_df, title, relative = FALSE, grid = TRUE){
  breaks <- c(500000,
              1000000,
              2000000,
              5000000,
              10000000,
              20000000,
              50000000,
              100000000,
              200000000)
  xlimits <- range(breaks)
  result_levels <- c('correct', 'ambiguous', 'inconclusive', 'incorrect')

  sum_df <- sum_df %>%
    mutate(result = factor(result, ordered = TRUE, levels = result_levels))

  # The two modes differ only in the plotted column, its label and the y range
  if (relative){
    y_col <- 'p'
    y_label <- 'Fraction of samples'
    ylimits <- c(0, 1)
  } else {
    y_col <- 'N'
    y_label <- 'Number of samples'
    ylimits <- c(0, sum_df %>% group_by(query_bp) %>% summarize(N = sum(N)) %>% pull(N) %>% max)
  }

  # Get colors from a Color Brewer palette, one per outcome
  fill_colors <- setNames(RColorBrewer::brewer.pal(4, "Accent"), result_levels)

  y_scale <- if (grid) {
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver())
  } else {
    scale_y_continuous()
  }

  p1 <- ggplot(sum_df, aes(x = query_bp, y = .data[[y_col]], fill = result)) +
    geom_area(position = 'stack') +
    scale_fill_manual(values = fill_colors) +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')), breaks = breaks) +
    y_scale +
    ggtitle(title) +
    ylab(y_label) +
    xlab('Base pairs in query images') +
    theme_few() +
    theme(axis.text.x = element_text(hjust = 1, angle = 45))

  if (grid){
    # Transparent panel drawn on top, so the grid lines show over the areas
    p1 <- p1 +
      theme(panel.background = element_rect(fill = NA),
            panel.grid.major.y = element_line(colour = gray(0.5)),
            panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
            panel.ontop = TRUE)
  }

  p1 + coord_cartesian(xlim = xlimits, ylim = ylimits, expand = FALSE)
}
Now let’s plot genus-level accuracy for a model taking quality labels into account:
# Score the varKoder cross-validation predictions, then summarise and plot
# genus-level outcomes as a fraction of samples per query size.
results <- read_and_process_xval('Malpighiaceae+Chrysobalanaceae/varKoder/vit_results/')
summary_genus <- summarize_results(results, level = 'genus')
p_genus <- plot_area(summary_genus, title = 'varKoder genus', relative = TRUE)
p_genus
Now the same but with species
# Species-level outcomes for the same varKoder results
summary_species <- summarize_results(results, level = 'species')
p_species <- plot_area(summary_species, title = 'varKoder species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_species
Finally, family
# Family-level outcomes for the same varKoder results
summary_family <- summarize_results(results, level = 'family')
p_family <- plot_area(summary_family, title = 'varKoder family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_family
Now we will try to identify which samples failed and why they failed. Particularly, how do DNA quality, amount of data, and the number of samples per class impact results? We will use genus-level predictions to test.
# Confidence given to the actual genus of each query. All columns whose name
# starts with 'genus' are pivoted to long form, and only the row whose column
# name equals the sample's actual 'genus:<name>' label is kept.
genus_predictions <- results %>%
  mutate(actual_genus = str_extract(actual_labels, 'genus:[^;]*')) %>%
  select(-starts_with('family'), -starts_with('species')) %>%
  pivot_longer(cols = starts_with("genus"), names_to = "predicted_label", values_to = "confidence") %>%
  filter(actual_genus == predicted_label) %>%
  select(query_bp, sample_id, basefrequency_sd, actual_genus, confidence) %>%
  mutate(query_bp = 1000*(str_remove(query_bp, "K") %>% as.integer))

# Number of distinct samples per genus, joined back onto every prediction row.
# The join key is stated explicitly instead of being guessed from shared names.
genus_predictions <- genus_predictions %>%
  select(sample_id, actual_genus) %>%
  distinct() %>%
  group_by(actual_genus) %>%
  summarise(N_samples = n()) %>%
  right_join(genus_predictions, by = 'actual_genus')
Joining with `by = join_by(actual_genus)`
genus_predictions
Now let’s make some plots. First, what is the effect of number of samples per class in confidence?
# Confidence in the correct genus against the number of samples of that genus
# minus one (N_samples - 1), on a log10 x axis.
plot_genus_N_vs_conf <- ggplot(genus_predictions,
                               aes(x = N_samples - 1, y = confidence)) +
  geom_jitter(alpha = 0.3) +
  scale_color_viridis_c() +
  scale_x_log10() +
  # Disabled alternative: logit-scaled y axis with its own label
  #scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
  #ylab('Confidence in correct prediction\n(logit scale)') +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = 'Number of samples in correct genus\n(log scale)',
       y = 'Confidence in correct prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_N_vs_conf
Now, what is the effect of sample quality in confidence?
# Confidence in the correct genus against `basefrequency_sd`, on a log10 x axis
plot_genus_freqsd_vs_conf <- ggplot(genus_predictions,
                                    aes(x = basefrequency_sd, y = confidence)) +
  geom_point(alpha = 0.3) +
  scale_x_log10() +
  # Disabled alternative: logit-scaled y axis with its own label
  #scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
  #ylab('Confidence in correct prediction\n(logit scale)') +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = 'Standard deviation of base frequencies',
       y = 'Confidence in correct prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_freqsd_vs_conf
Now, what is the effect of amount of data in confidence?
# Confidence in the correct genus against the amount of query data, on a log10 x axis
plot_genus_bp_vs_conf <- ggplot(genus_predictions,
                                aes(x = query_bp, y = confidence)) +
  geom_jitter(alpha = 0.3) +
  scale_x_log10() +
  # Disabled alternative: logit-scaled y axis with its own label
  #scale_y_continuous(trans = "logit", breaks = c(1e-4,0.001,0.01,0.1,0.25,0.5,0.75,0.9,0.99,0.999,1-1e-4)) +
  #ylab('Confidence in correct prediction\n(logit scale)') +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = 'Base pairs in query images\n(log scale)',
       y = 'Confidence in correct prediction') +
  theme_few() +
  theme(panel.grid.major.y = element_line(colour = gray(0.8)))
plot_genus_bp_vs_conf
Now let’s save the three of them as a single plot using patchwork.
# Three confidence panels side by side, tagged A-C. The second and third panels
# have their y-axis title and tick labels blanked; all panels use 8 pt text.
combined_conf <- patchwork::wrap_plots(
  plot_genus_N_vs_conf +
    theme(text = element_text(size = 8)),
  plot_genus_bp_vs_conf +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          text = element_text(size = 8)),
  plot_genus_freqsd_vs_conf +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          text = element_text(size = 8))
) +
  patchwork::plot_annotation(tag_levels = 'A')
combined_conf
# Pass the figure explicitly so the saved file does not depend on which plot
# happened to be created or printed last.
ggsave(filename = 'images_manuscript/supp_conf_predictors.pdf',
       plot = combined_conf,
       device = 'pdf',
       width = 7,
       height = 3,
       units = 'in',
       useDingbats = FALSE)
Let’s put it all together now in a linear model:
# Response: logit of the confidence; values of exactly 1 are nudged down first
# so that their logit is finite. Predictors: each centred on its mean and
# divided by its standard deviation.
lm_data <- genus_predictions %>%
  mutate(confidence = ifelse(confidence == 1, confidence - 1e-7, confidence)) %>%
  mutate(confidence = car::logit(confidence)) %>%
  mutate(across(c(query_bp, basefrequency_sd, N_samples),
                ~ (.x - mean(.x)) / sd(.x)))

# Linear model with all main effects and interactions of the three predictors
full_model <- lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
                 data = lm_data)
full_model
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Coefficients:
(Intercept) query_bp
4.79835 0.07638
basefrequency_sd N_samples
-0.75108 1.69980
query_bp:basefrequency_sd query_bp:N_samples
-0.04625 -0.11011
basefrequency_sd:N_samples query_bp:basefrequency_sd:N_samples
-0.25966 -0.38270
summary(full_model)
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-17.3636 -0.9435 0.4652 1.4977 5.3559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.79835 0.05705 84.110 < 2e-16
query_bp 0.07638 0.07636 1.000 0.3173
basefrequency_sd -0.75108 0.10965 -6.850 9.49e-12
N_samples 1.69980 0.06023 28.223 < 2e-16
query_bp:basefrequency_sd -0.04625 0.19985 -0.231 0.8170
query_bp:N_samples -0.11011 0.08252 -1.334 0.1822
basefrequency_sd:N_samples -0.25966 0.13175 -1.971 0.0489
query_bp:basefrequency_sd:N_samples -0.38270 0.23793 -1.608 0.1079
(Intercept) ***
query_bp
basefrequency_sd ***
N_samples ***
query_bp:basefrequency_sd
query_bp:N_samples
basefrequency_sd:N_samples *
query_bp:basefrequency_sd:N_samples
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.351 on 2251 degrees of freedom
Multiple R-squared: 0.4207, Adjusted R-squared: 0.4189
F-statistic: 233.5 on 7 and 2251 DF, p-value: < 2.2e-16
plot(full_model)
# Stepwise model selection starting from the full model, in both directions
reduced_model <- step(full_model, direction = "both")
Start: AIC=3869.65
confidence ~ query_bp * basefrequency_sd * N_samples
Df Sum of Sq RSS AIC
<none> 12439 3869.7
- query_bp:basefrequency_sd:N_samples 1 14.297 12453 3870.2
reduced_model
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Coefficients:
(Intercept) query_bp
4.79835 0.07638
basefrequency_sd N_samples
-0.75108 1.69980
query_bp:basefrequency_sd query_bp:N_samples
-0.04625 -0.11011
basefrequency_sd:N_samples query_bp:basefrequency_sd:N_samples
-0.25966 -0.38270
summary(reduced_model)
Call:
lm(formula = confidence ~ query_bp * basefrequency_sd * N_samples,
data = lm_data)
Residuals:
Min 1Q Median 3Q Max
-17.3636 -0.9435 0.4652 1.4977 5.3559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.79835 0.05705 84.110 < 2e-16
query_bp 0.07638 0.07636 1.000 0.3173
basefrequency_sd -0.75108 0.10965 -6.850 9.49e-12
N_samples 1.69980 0.06023 28.223 < 2e-16
query_bp:basefrequency_sd -0.04625 0.19985 -0.231 0.8170
query_bp:N_samples -0.11011 0.08252 -1.334 0.1822
basefrequency_sd:N_samples -0.25966 0.13175 -1.971 0.0489
query_bp:basefrequency_sd:N_samples -0.38270 0.23793 -1.608 0.1079
(Intercept) ***
query_bp
basefrequency_sd ***
N_samples ***
query_bp:basefrequency_sd
query_bp:N_samples
basefrequency_sd:N_samples *
query_bp:basefrequency_sd:N_samples
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.351 on 2251 degrees of freedom
Multiple R-squared: 0.4207, Adjusted R-squared: 0.4189
F-statistic: 233.5 on 7 and 2251 DF, p-value: < 2.2e-16
plot(reduced_model)
For skmer, we left each sample out, built a reference and then queried that sample. We have several files in which reference samples are ordered by their distance to the query; here we will evaluate whether the closest sample is from the correct species or genus.
Because it is not clear how skmer behaves for different levels of coverage, we repeated this for several input sizes (in number of basepairs) as query, but always used the maximum input size available (up to 200Mb) for references.
Let’s make a function that extracts these results as a table.
# Distinct (sample_id, actual_labels) pairs from the cross-validation results
samp_labels <- results %>%
  select(sample_id, actual_labels) %>%
  distinct()
# Parse the first two lines of one Skmer result file.
#
# Line 1 is matched against '<sample>@<digits>K' and gives the query sample and
# its size. Line 2 is matched against '<sample>@... <decimal number>' and gives
# the first listed reference sample and its distance.
#
# Returns a one-row tibble: sample_id, query_bp (character, e.g. '500K'),
# closest_reference_sample_id, closest_distance (numeric). Fields that do not
# match their pattern are NA.
extract_skmer_results <- function(file_path) {
  # Only the query line and the first reference line are needed
  first_two_lines <- readLines(file_path, n = 2)

  query_fields <- str_match(first_two_lines[1], "\\s*(.*?)@(\\d+K)")
  reference_fields <- str_match(first_two_lines[2], "\\s*(.*?)@.*\\s+(\\d+\\.\\d+)")

  tibble(
    sample_id = query_fields[, 2],
    query_bp = query_fields[, 3],
    closest_reference_sample_id = reference_fields[, 2],
    closest_distance = as.numeric(reference_fields[, 3])
  )
}
Now we will apply this function to all skmer output files.
# Parse all Skmer result files in parallel, attach the labels of the query and
# of its closest reference sample, and score the closest reference at family,
# genus and species level (same column layout as the varKoder results).
plan(multisession, workers = 12)
skmer_results_df <- furrr::future_map_dfr(
  list.files('Malpighiaceae+Chrysobalanaceae/skmer/skmer_xval_results/', full.names = TRUE),
  ~ extract_skmer_results(.x)
) %>%
  left_join(samp_labels, by = 'sample_id') %>%
  # Labels of the closest reference sample play the role of the prediction
  left_join(
    samp_labels %>% select(
      closest_reference_sample_id = 'sample_id',
      predicted_labels = actual_labels
    ),
    by = 'closest_reference_sample_id'
  ) %>%
  mutate(
    # Split first, then drop the quality flag. Removing it with the regex
    # ';*low_quality:True;*' also removed the separators on both sides, so a
    # flag in the middle of the string fused its two neighbouring labels.
    query_labels = str_split(actual_labels, ';') %>%
      purrr::map(~ .x[!(.x %in% 'low_quality:True')]),
    predicted_list = str_split(predicted_labels, ';')
  ) %>%
  rowwise() %>%
  mutate(
    family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
    genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
    # Species columns are NA for queries that have no species label
    species_correct = ifelse(any(str_detect(query_labels, 'species')),
                             query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                             NA),
    family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                               query_labels[str_detect(query_labels, 'family')])),
    genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                              query_labels[str_detect(query_labels, 'genus')])),
    species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                               any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                       query_labels[str_detect(query_labels, 'species')])),
                               NA)
  )
plan(sequential)
skmer_results_df
Now let’s summarize and plot by genus:
# Genus-level outcomes for Skmer
skmer_summary_genus <- summarize_results(skmer_results_df, level = 'genus')
p_skmer_genus <- plot_area(skmer_summary_genus, title = 'Skmer genus', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_genus
Now by species. In Skmer, there is no inconclusive result: if there is no correct species prediction, it means that a sample was predicted in the wrong genus and therefore it is incorrect
# Species-level outcomes for Skmer. Every outcome other than 'correct' is
# relabelled 'incorrect', and N and p are then summed within each
# (query_bp, result) pair. The result stays grouped by query_bp.
skmer_summary_species <- summarize_results(skmer_results_df, level = 'species') %>%
  mutate(result = ifelse(result == 'correct', 'correct', 'incorrect')) %>%
  group_by(query_bp, result) %>%
  summarise(across(everything(), sum), .groups = 'drop_last')
p_skmer_species <- plot_area(skmer_summary_species, title = 'Skmer species', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_species
And now by family:
# Family-level outcomes for Skmer
skmer_summary_family <- summarize_results(skmer_results_df, level = 'family')
skmer_summary_family
p_skmer_family <- plot_area(skmer_summary_family, title = 'Skmer family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_skmer_family
Let’s now read the traditional barcode results and summarize them in the same way as skmer and varKoder. Let’s start by defining a function that reads the data so we can summarize it using the previously defined functions.
# Read the traditional-barcode BLAST summary for one data amount and score it
# like the varKoder and Skmer results.
#
# Argument:
#   bp  Number inserted into the file name '<bp>M_blast_phylo_sum_sp.tsv'.
#
# The table has one row per query (column `sp`) and one column per marker
# holding the closest reference sample. It is reshaped to one row per
# query/marker pair; the 'Concatenated_phylogeny' column is discarded. Labels
# of the query and of the closest reference are looked up in the global
# `samp_labels`.
#
# Returns a row-wise tibble with the <rank>_correct / <rank>_incorrect columns
# and query_bp = bp * 1e3. Where there is no predicted label (no closest
# reference found in `samp_labels`), every non-NA flag is set to FALSE.
read_traditional_barcodes <- function(bp) {
  input_file <- paste0(
    'Malpighiaceae+Chrysobalanaceae/traditional_barcodes/2_blast_phylogeny_result/Genus/',
    bp,
    'M_blast_phylo_sum_sp.tsv'
  )

  barcode_res <- read_delim(input_file) %>%
    pivot_longer(-sp, names_to = 'marker', values_to = 'closest_reference_sample_id') %>%
    rename(sample_id = 'sp') %>%
    mutate(
      # Identifiers carry an '@...' suffix that is not part of the sample id
      sample_id = str_remove_all(sample_id, '@.+'),
      closest_reference_sample_id = str_remove_all(closest_reference_sample_id, '@.+'),
      predicted_labels = samp_labels$actual_labels[match(closest_reference_sample_id, samp_labels$sample_id)],
      actual_labels = samp_labels$actual_labels[match(sample_id, samp_labels$sample_id)]
    ) %>%
    filter(marker != 'Concatenated_phylogeny') %>%
    mutate(
      # Split first, then drop the quality flag. Removing it with the regex
      # ';*low_quality:True;*' also removed the separators on both sides, so a
      # flag in the middle of the string fused its two neighbouring labels.
      query_labels = str_split(actual_labels, ';') %>%
        purrr::map(~ .x[!(.x %in% 'low_quality:True')]),
      predicted_list = str_split(predicted_labels, ';')
    ) %>%
    rowwise() %>%
    mutate(
      family_correct = query_labels[str_detect(query_labels, 'family')] %in% predicted_list,
      genus_correct = query_labels[str_detect(query_labels, 'genus')] %in% predicted_list,
      species_correct = ifelse(any(str_detect(query_labels, 'species')),
                               query_labels[str_detect(query_labels, 'species')] %in% predicted_list,
                               NA),
      family_incorrect = any(!(predicted_list[str_detect(predicted_list, 'family')] %in%
                                 query_labels[str_detect(query_labels, 'family')])),
      genus_incorrect = any(!(predicted_list[str_detect(predicted_list, 'genus')] %in%
                                query_labels[str_detect(query_labels, 'genus')])),
      species_incorrect = ifelse(any(str_detect(query_labels, 'species')),
                                 any(!(predicted_list[str_detect(predicted_list, 'species')] %in%
                                         query_labels[str_detect(query_labels, 'species')])),
                                 NA)
    ) %>%
    # No predicted label: neither correct nor incorrect (NA flags stay NA)
    mutate(across(c(ends_with("_correct"), ends_with("_incorrect")),
                  ~ ifelse(is.na(predicted_labels) & !is.na(.x), FALSE, .x))) %>%
    mutate(query_bp = bp * 1e3)

  barcode_res
}
Now we can apply this function to all of our results:
# Read and score the barcode results for every data amount and stack them
results_barcodes <- c(10, 20, 50, 100, 200) %>%
  purrr::map_dfr(read_traditional_barcodes)
Rows: 288 Columns: 7── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 285 Columns: 7── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 267 Columns: 7── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 200 Columns: 7── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 166 Columns: 7── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (7): sp, matK, rbcL, ndhF, trnL-F, ITS, Concatenated_phylogeny
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
results_barcodes
Now let’s summarise for each marker separately:
# Outcome summaries per marker: the results are split by marker, summarised
# separately, and row-bound with the marker name stored in column `marker`.
barcode_summary_family <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(summarize_results, level = 'family', .id = 'marker')
barcode_summary_family

barcode_summary_genus <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(summarize_results, level = 'genus', .id = 'marker')
barcode_summary_genus

barcode_summary_species <- results_barcodes %>%
  split(.$marker) %>%
  purrr::map_dfr(summarize_results, level = 'species', .id = 'marker')
barcode_summary_species
Now let’s plot, making separate plots for each marker:
Species:
# One species-level area plot per marker, titled '<marker> species'
p_barcode_species <- split(barcode_summary_species, barcode_summary_species$marker) %>%
  purrr::imap(~ plot_area(.x, paste0(.y, ' species'), relative = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_species
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Genera:
# One genus-level area plot per marker, titled '<marker> genus'
p_barcode_genus <- split(barcode_summary_genus, barcode_summary_genus$marker) %>%
  purrr::imap(~ plot_area(.x, paste0(.y, ' genus'), relative = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_genus
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Family:
# One family-level area plot per marker, titled '<marker> family'
p_barcode_family <- split(barcode_summary_family, barcode_summary_family$marker) %>%
  purrr::imap(~ plot_area(.x, paste0(.y, ' family'), relative = TRUE))
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_barcode_family
$ITS
$matK
$ndhF
$rbcL
$`trnL-F`
Now let’s compare methods side by side. For genus level:
# Prepare a panel for vertical stacking: blank the x-axis title, tick labels
# and ticks, and strip a trailing ' genus', ' species' or ' family' from the
# plot title. Returns the modified plot.
reformat_graphs <- function(x){
  short_title <- sub(" (genus|species|family)$", "", x$labels$title)
  no_x_axis <- theme(axis.title.x = element_blank(),
                     axis.text.x = element_blank(),
                     axis.ticks.x = element_blank())
  x + no_x_axis + labs(title = short_title)
}
# Genus-level comparison: varKoder, Skmer, ITS and rbcL stacked in one column.
# Only the bottom panel (rbcL) keeps its x axis; its title is shortened too.
p <- patchwork::wrap_plots(
  reformat_graphs(p_genus),
  reformat_graphs(p_skmer_genus),
  reformat_graphs(p_barcode_genus$ITS),
  p_barcode_genus$rbcL + labs(title = sub(" genus$", "", p_barcode_genus$rbcL$labels$title)),
  ncol = 1
) +
  plot_annotation(title = 'Genus-level accuracy')
p
# Pass the figure explicitly so the saved files do not depend on which plot
# happened to be created or printed last.
ggsave('images_manuscript/fig3_genus_accuracy.pdf', plot = p, width = 4.5, height = 10)
ggsave('images_manuscript/fig3_genus_accuracy.png', plot = p, width = 4.5, height = 10, dpi = 1200)
Now for species level:
# Species-level comparison: varKoder, Skmer, ITS and rbcL stacked in one column.
# Only the bottom panel (rbcL) keeps its x axis; its title is shortened too.
p <- patchwork::wrap_plots(
  reformat_graphs(p_species),
  reformat_graphs(p_skmer_species),
  reformat_graphs(p_barcode_species$ITS),
  p_barcode_species$rbcL + labs(title = sub(" species$", "", p_barcode_species$rbcL$labels$title)),
  ncol = 1
) +
  plot_annotation(title = 'Species-level accuracy')
p
# Pass the figure explicitly so the saved files do not depend on which plot
# happened to be created or printed last.
ggsave('images_manuscript/fig3_species_accuracy.pdf', plot = p, width = 4.5, height = 10)
ggsave('images_manuscript/fig3_species_accuracy.png', plot = p, width = 4.5, height = 10, dpi = 1200)
Now for family level:
# Family-level comparison: varKoder, Skmer, ITS and rbcL stacked in one column.
# Only the bottom panel (rbcL) keeps its x axis; its title is shortened too.
p <- patchwork::wrap_plots(
  reformat_graphs(p_family),
  reformat_graphs(p_skmer_family),
  reformat_graphs(p_barcode_family$ITS),
  p_barcode_family$rbcL + labs(title = sub(" family$", "", p_barcode_family$rbcL$labels$title)),
  ncol = 1
) +
  # Capitalised to match the genus- and species-level figure titles
  plot_annotation(title = 'Family-level accuracy')
p
# Pass the figure explicitly so the saved files do not depend on which plot
# happened to be created or printed last.
ggsave('images_manuscript/fig3_family_accuracy.pdf', plot = p, width = 4.5, height = 10)
ggsave('images_manuscript/fig3_family_accuracy.png', plot = p, width = 4.5, height = 10, dpi = 1200)
Now let’s compare the time to produce references and to produce
Finally, let’s summarize results for the whole SRA dataset. In this case, we only have varKoder since Skmer cannot finish and traditional barcodes are inapplicable.
# varKoder query results for the SRA dataset: the first column is dropped and
# only queries of standardized sizes are kept.
varKoder_SRA_results <- read_csv('all_SRA/varkoder_query_results/predictions.csv') %>%
  select(-1) %>%
  rename(query_bp = query_basepairs) %>%
  filter(str_detect(query_bp, '^0+[125]0+K$')) %>% # ignore queries that are not standardized sizes
  mutate(quality_included = TRUE)
New names:Rows: 41103 Columns: 873── Column specification ──────────────────────────────────────────────────
Delimiter: ","
chr (7): varKode_image_path, sample_id, query_basepairs, query_kmer_...
dbl (865): ...1, prediction_threshold, actual_labels, basefrequency_sd...
lgl (1): possible_low_quality
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Score the SRA predictions: one label per sample is compared with the list of
# predicted labels, giving family_correct / family_incorrect.
# Make sure futures are resolved sequentially from here on.
plan(sequential)
# Unique label strings across all SRA queries.
# NOTE(review): SRA_taxlabels is not used anywhere else in the visible code.
SRA_taxlabels = str_remove(varKoder_SRA_results$actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist %>% unique
varKoder_SRA_results = varKoder_SRA_results %>%
# unlist() flattens the split labels into a plain vector, so this only lines up
# with the rows if every sample has exactly one label left after removing the
# quality flag -- NOTE(review): confirm this holds for the SRA data.
mutate(query_labels = str_remove(actual_labels,";*low_quality:True;*") %>% str_split(';') %>% unlist,
predicted_list = str_split(predicted_labels,';')
) %>%
rowwise() %>%
# Correct: the actual label is among the predicted labels.
# Incorrect: some predicted label differs from the actual one; a query with no
# predicted labels (NA) is never counted as incorrect.
mutate(family_correct = query_labels %in% predicted_list,
family_incorrect = ifelse(is.na(predicted_labels),FALSE,any(!(predicted_list %in% query_labels)))) %>%
# Keep only columns whose name does not start with a digit (presumably this
# drops the per-label confidence columns -- TODO confirm).
select(matches("^[^0-9]"))
varKoder_SRA_results
NA
Now let’s summarize and plot:
# Family-level outcomes for the SRA dataset
SRA_summary_family <- summarize_results(varKoder_SRA_results, level = 'family')
SRA_summary_family

# Total number of scored samples per query size
N_samp <- SRA_summary_family %>%
  group_by(query_bp) %>%
  summarise(N = sum(N))

p_SRA_family <- plot_area(SRA_summary_family, title = 'varKoder SRA family', relative = TRUE)
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
p_SRA_family
Let’s now do the SRA plot, but splitting by kingdom. First, we need to retrieve kingdom information:
# Attach the kingdom of each SRA run to the scored results, then summarise and
# plot family-level outcomes separately for every kingdom. Each plot is titled
# with the kingdom name and its x range is narrowed to 500 Kbp - 10 Mbp.
p_SRA_families <- read_csv('all_SRA/runs_to_download_data.csv') %>%
  select(sample_id = Run, Kingdom) %>%
  # Join key stated explicitly instead of being guessed from shared column names
  right_join(varKoder_SRA_results, by = 'sample_id') %>%
  split(.$Kingdom) %>%
  purrr::map(summarize_results, level = 'family') %>%
  purrr::imap(~ plot_area(.x, .y, relative = TRUE) +
                coord_cartesian(xlim = c(500, 10000) * 1000, expand = FALSE))
Warning: One or more parsing issues, call `problems()` on your data frame for
details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 8264 Columns: 51── Column specification ──────────────────────────────────────────────────
Delimiter: ","
chr (28): Run, AssemblyName, download_path, Experiment, LibraryName, ...
dbl (11): spots, bases, spots_with_mates, avgLength, size_MB, InsertS...
lgl (10): g1k_pop_code, source, g1k_analysis_group, Subject_ID, Disea...
dttm (2): ReleaseDate, LoadDate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(sample_id)`Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which
will replace the existing one.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which
will replace the existing one.Scale for y is already present.
Adding another scale for y, which will replace the existing scale.Coordinate system already present. Adding new coordinate system, which
will replace the existing one.
p_SRA_families
$Fungi
$Metazoa
$Viridiplantae
Now let’s join to create a plot for publication:
# Strip a panel for use to the right of another one: blanks the y-axis title,
# tick labels and ticks, hides the legend and blanks the x-axis title.
# Returns the modified plot.
remove_y_axis_and_scale <- function(x){
  stripped <- theme(axis.title.y = element_blank(),
                    axis.text.y = element_blank(),
                    axis.ticks.y = element_blank(),
                    legend.position = 'none',
                    axis.title.x = element_blank())
  x + stripped
}
# Give the Fungi panel an x-axis label. Its x-axis title is blanked again by
# remove_y_axis_and_scale() below, so the label visible in the figure comes
# from `x_title_plot`.
p_SRA_families$Fungi <- p_SRA_families$Fungi + labs(x = "Base pairs in query images")

# Combine plots: Metazoa keeps its y axis, the other two panels are stripped
p_combined <- wrap_plots(
  p_SRA_families$Metazoa +
    theme(legend.position = 'none',
          axis.title.x = element_blank()),
  remove_y_axis_and_scale(p_SRA_families$Viridiplantae),
  remove_y_axis_and_scale(p_SRA_families$Fungi),
  nrow = 1
)

# Create a blank ggplot object with the desired x-axis title, shared by all panels
x_title_plot <- ggplot() +
  theme_void() +
  labs(x = "Base pairs in query images") +
  theme(plot.margin = margin(0, 0, 0, 0),
        axis.title.x = element_text(size = 10, hjust = 0.5))

# Panels on top (95% of the height), shared x title below (5%)
p <- wrap_plots(p_combined, x_title_plot, ncol = 1, heights = c(0.95, 0.05))
print(p)
# Pass the figure explicitly so the saved files do not depend on which plot
# happened to be created or printed last.
ggsave('images_manuscript/fig3_SRA_accuracy.pdf', plot = p, width = 4.5, height = 4)
ggsave('images_manuscript/fig3_SRA_accuracy.png', plot = p, width = 4.5, height = 4, dpi = 1200)
Here we just query our results to get a few figures that we report in the paper.
Total number of samples used in cross-validation:
dim(samp_labels)
[1] 287 2
Number of Stigmaphyllon samples with each kind of error for varkoder:
summary_species
Number of Stigmaphyllon samples with each kind of error for skmer:
skmer_summary_species
varKoder accuracy for genera:
summary_genus
varKoder accuracy for family:
summary_family
Skmer accuracy for genera:
skmer_summary_genus
Skmer accuracy for family:
skmer_summary_family
Number of samples available for each genus and data amount
# Number of result rows for each query size.
# NOTE(review): `genus` is extracted but never used -- rows are grouped by
# query_bp only, so this is not a per-genus count. complete() is called without
# columns, so it does not appear to add any rows here -- confirm the intent.
results %>%
mutate(genus = str_extract(actual_labels,"(?<=genus:)[^;]+")) %>%
group_by(query_bp) %>%
summarize(N=n()) %>%
complete()
Plot number of samples for supplementary material.
# Number of result rows per genus and query size; combinations that do not
# occur are added with N = 0.
n_samples_genera <- results %>%
  mutate(taxon = str_extract(actual_labels, "(?<=genus:)[^;]+")) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_genera
# Number of result rows per species and query size, for samples that have a
# species label; combinations that do not occur are added with N = 0.
n_samples_species <- results %>%
  mutate(taxon = str_extract(actual_labels, "(?<=species:)[^;]+")) %>%
  filter(!is.na(taxon)) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_species
# Number of SRA result rows per label and query size; combinations that do not
# occur are added with N = 0.
n_samples_SRA <- varKoder_SRA_results %>%
  mutate(taxon = as.character(actual_labels)) %>%
  group_by(taxon, query_bp) %>%
  summarize(N = n(), .groups = 'drop') %>%
  complete(taxon, query_bp, fill = list(N = 0))
`summarise()` has grouped output by 'taxon'. You can override using the `.groups` argument.
n_samples_SRA
# Stacked area plot of the number of samples per taxon against query size.
#
# Arguments:
#   df         Table with columns taxon, query_bp (character such as
#              '00000500K') and N.
#   title      Plot title.
#   bp_levels  Query-size strings that define the x-axis breaks and limits.
#              Defaults to the sizes in the global `n_samples_genera`, which
#              the function previously read directly; pass the sizes of `df`
#              (or any other set) to override.
#
# Returns a ggplot object without legend; taxa get colours from the 'turbo'
# palette, reordered so that neighbouring taxa receive contrasting colours.
plot_Nsamples_area <- function(df, title, bp_levels = unique(n_samples_genera$query_bp)){
  df <- df %>%
    mutate(query_bp = parse_number(query_bp) * 1000)

  x_breaks <- 1000 * parse_number(bp_levels)

  n_levels <- length(unique(df$taxon))
  viridis_colors <- viridis::turbo(n_levels)
  half_n <- ceiling(n_levels / 2)
  # Interleave the first and second half of the palette: first-half colours go
  # to odd positions, second-half colours to even positions. Done by position
  # rather than with rbind(), which recycled the shorter half (with a warning)
  # when the number of taxa was odd and produced NA for a single taxon.
  target_position <- c(seq(1, by = 2, length.out = half_n),
                       seq(2, by = 2, length.out = n_levels - half_n))
  reordered_colors <- viridis_colors[order(target_position)]

  ggplot(df, aes(x = query_bp, y = N, fill = taxon, color = taxon, group = taxon)) +
    geom_area(position = position_stack()) +
    #geom_line(position='stack') +
    scale_fill_manual(values = reordered_colors,
                      aesthetics = c('colour', 'fill'),
                      guide = 'none') +
    scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si('bp')),
                  breaks = x_breaks,
                  limits = range(x_breaks)) +
    scale_y_continuous(n.breaks = 10, minor_breaks = waiver()) +
    ggtitle(title) +
    ylab('Number of samples') +
    xlab('Base pairs in query images') +
    theme_few() +
    # Transparent panel drawn on top, so the grid lines show over the areas
    theme(axis.text.x = element_text(hjust = 1, angle = 45),
          panel.background = element_rect(fill = NA),
          panel.grid.major.y = element_line(colour = gray(0.5)),
          panel.grid.minor.y = element_line(colour = gray(0.6), linetype = 2),
          panel.ontop = TRUE)
}
# Sample-count plots for the supplementary material (typos in the two plot
# titles corrected: 'Malpighiaceae', 'families').
N_species <- plot_Nsamples_area(n_samples_species, title = 'Stigmaphyllon Species')
N_genera <- plot_Nsamples_area(n_samples_genera, title = 'Malpighiaceae and Chrysobalanaceae Genera')
N_families <- plot_Nsamples_area(n_samples_SRA, title = 'SRA families')
Warning: number of columns of result is not a multiple of vector length (arg 2)
cowplot::plot_grid(N_genera,N_species,N_families, nrow = 1)
Total number of SRA samples. Validation:
# Count the rows of the SRA model input table by `is_valid`, after dropping
# the first column.
# NOTE(review): this path does not exist relative to the working directory
# (see the error below). Other SRA files in this notebook are read from paths
# starting with 'all_SRA/', without the leading 'varKoder/' -- confirm the
# intended location of input_data.csv.
read_csv('varKoder/all_SRA/varkoder_trained_model_ML/input_data.csv')[-1] %>%
group_by(is_valid) %>%
summarise(N = n())
Error: 'varKoder/all_SRA/varkoder_trained_model_ML/input_data.csv' does not exist in current working directory ('/Users/bruno/Documents/docs_macbookair2015/papers/working/CNN_spdelim/varKoder_development/varKoder_tests').